Introduction

The data for the project comes from a Kaggle dataset on Spotify music that a user assembled to try to classify whether or not he would like a given song.

library(lattice)
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(matrixStats)
# Load the Spotify dataset; the CSV column "X" becomes the row names.
# NOTE(review): absolute home-directory path — consider a relative path so
# the analysis is reproducible on other machines.
data = read.csv("~/Desktop/STAT434Data/data.csv", row.names = "X")
# Keep a full copy (with the song/artist text columns) for the EDA counts,
# then drop the two free-text identifier columns from the modeling frame.
drops = c("song_title", "artist")
datafull = data
data=data[, !(names(data) %in% drops)]
# Encode the 0/1 response as a factor so classifiers treat it as a class label.
data$target = factor(data$target)

The response variable in the dataset is the “target” variable, which is a 0 or 1 encoding of whether or not the creator of the dataset liked that song.

Exploratory Data Analysis

length(datafull$song_title)
## [1] 2017
length(unique(datafull$artist))
## [1] 1343
sapply(data, class)
##     acousticness     danceability      duration_ms           energy 
##        "numeric"        "numeric"        "integer"        "numeric" 
## instrumentalness              key         liveness         loudness 
##        "numeric"        "integer"        "numeric"        "numeric" 
##             mode      speechiness            tempo   time_signature 
##        "integer"        "numeric"        "numeric"        "numeric" 
##          valence           target 
##        "numeric"         "factor"

The listener gave data on 2017 different songs, by 1343 unique artists. The data types of the variables in the dataset are above.

colMeans(data[ ,-c( 6, 9, 12, 14, 15, 16)])
##     acousticness     danceability      duration_ms           energy 
##     1.875900e-01     6.184219e-01     2.463062e+05     6.815771e-01 
## instrumentalness         liveness         loudness      speechiness 
##     1.332855e-01     1.908440e-01    -7.085624e+00     9.266425e-02 
##            tempo          valence 
##     1.216033e+02     4.968150e-01
colSds(data.matrix(data[ ,-c(6, 9, 12, 14, 15, 16)]))
##  [1] 2.599893e-01 1.610290e-01 8.198181e+04 2.102730e-01 2.731622e-01
##  [6] 1.554532e-01 3.761684e+00 8.993146e-02 2.668560e+01 2.471955e-01
print("Key")
## [1] "Key"
table(data$key)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11 
## 216 257 184  63 105 166 159 212 136 191 141 187
print("Mode")
## [1] "Mode"
table(data$mode)
## 
##    0    1 
##  782 1235
print("Time Signature")
## [1] "Time Signature"
table(data$time_signature)
## 
##    1    3    4    5 
##    1   93 1891   32
print("Target")
## [1] "Target"
table(data$target)
## 
##    0    1 
##  997 1020

Variable Importance, Selection, and Reduction

# Roughly 50/50 train/test split via an independent Bernoulli draw per row.
# NOTE(review): train1/test1 and train/test are reassigned to different
# objects later in the report (the data-frame splits in the model-fitting
# sections), so these logical-vector versions only live until then.
set.seed (10)
train1=sample(c(TRUE,FALSE), nrow(data),rep=TRUE)
train = data[train1,]
test1 = (! train1)
test = data[test1,]
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Random forest on the training half with permutation importance tracked, so
# MeanDecreaseAccuracy / MeanDecreaseGini can rank the 13 predictors.
rf.vars = randomForest(target~., data = train, importance= TRUE)
importance(rf.vars)
##                           0          1 MeanDecreaseAccuracy
## acousticness     22.7797930 -1.1344129           18.9798922
## danceability     18.4507816  6.8442835           18.5481233
## duration_ms      22.6531085 10.7353023           23.6135536
## energy           25.7303143  1.2139683           23.5494097
## instrumentalness 35.0416144 20.0670530           37.7967988
## key              -0.9876866  2.0393756            0.8358347
## liveness          2.9130851 -1.7029870            0.6036024
## loudness         32.5664149 11.9819470           33.9648429
## mode              2.6464599  1.4590492            2.8578427
## speechiness      15.6402011 21.8281660           26.5126741
## tempo             8.2384417 -0.4943151            5.6558706
## time_signature    2.4647040 -0.6570962            1.2882844
## valence          19.9035118  4.8412638           18.5942548
##                  MeanDecreaseGini
## acousticness            45.241211
## danceability            43.506377
## duration_ms             52.382558
## energy                  48.879434
## instrumentalness        55.665570
## key                     19.957803
## liveness                32.088025
## loudness                62.800576
## mode                     5.391236
## speechiness             53.057914
## tempo                   35.693373
## time_signature           2.092565
## valence                 44.477957
varImpPlot(rf.vars)

Using the importance from random forests, there is a clear drop off in mean decrease accuracy after the inclusion of the first 8 variables. The three categorical variables key, mode, and time signature are among the least important using this procedure.

library(leaps)
# Subset selection via three search strategies, up to all 13 predictors.
# NOTE(review): backward selection is fit on `train` while forward and
# exhaustive are fit on the full `data`, so the three are not directly
# comparable, and using all rows leaks test information into variable
# selection. Also, regsubsets() performs least-squares subset selection,
# so the factor response is being handled numerically here — TODO confirm
# that was intended for a classification target.
regfit.bwd=regsubsets (target~.,data=train ,
method ="backward", nvmax = 13)

regfit.fwd=regsubsets (target~.,data=data ,
method ="forward", nvmax = 13)

regfit.ex=regsubsets (target~.,data=data ,
method ="exhaustive", nvmax = 13)
# Model size minimizing Mallows' Cp under each search strategy.
which.min(summary(regfit.bwd)$cp)
## [1] 7
which.min(summary(regfit.fwd)$cp)
## [1] 10
which.min(summary(regfit.ex)$cp)
## [1] 10
# Model size maximizing adjusted R^2 under each search strategy.
which.max(summary(regfit.bwd)$adjr2)
## [1] 11
which.max(summary(regfit.fwd)$adjr2)
## [1] 10
which.max(summary(regfit.ex)$adjr2)
## [1] 10
summary(regfit.ex)
## Subset selection object
## Call: regsubsets.formula(target ~ ., data = data, method = "exhaustive", 
##     nvmax = 13)
## 13 Variables  (and intercept)
##                  Forced in Forced out
## acousticness         FALSE      FALSE
## danceability         FALSE      FALSE
## duration_ms          FALSE      FALSE
## energy               FALSE      FALSE
## instrumentalness     FALSE      FALSE
## key                  FALSE      FALSE
## liveness             FALSE      FALSE
## loudness             FALSE      FALSE
## mode                 FALSE      FALSE
## speechiness          FALSE      FALSE
## tempo                FALSE      FALSE
## time_signature       FALSE      FALSE
## valence              FALSE      FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
##           acousticness danceability duration_ms energy instrumentalness
## 1  ( 1 )  " "          "*"          " "         " "    " "             
## 2  ( 1 )  " "          "*"          " "         " "    "*"             
## 3  ( 1 )  " "          "*"          " "         " "    "*"             
## 4  ( 1 )  "*"          "*"          " "         " "    "*"             
## 5  ( 1 )  "*"          "*"          " "         " "    "*"             
## 6  ( 1 )  "*"          "*"          "*"         " "    "*"             
## 7  ( 1 )  "*"          "*"          "*"         " "    "*"             
## 8  ( 1 )  "*"          "*"          "*"         " "    "*"             
## 9  ( 1 )  "*"          "*"          "*"         " "    "*"             
## 10  ( 1 ) "*"          "*"          "*"         " "    "*"             
## 11  ( 1 ) "*"          "*"          "*"         "*"    "*"             
## 12  ( 1 ) "*"          "*"          "*"         "*"    "*"             
## 13  ( 1 ) "*"          "*"          "*"         "*"    "*"             
##           key liveness loudness mode speechiness tempo time_signature
## 1  ( 1 )  " " " "      " "      " "  " "         " "   " "           
## 2  ( 1 )  " " " "      " "      " "  " "         " "   " "           
## 3  ( 1 )  " " " "      " "      " "  "*"         " "   " "           
## 4  ( 1 )  " " " "      " "      " "  "*"         " "   " "           
## 5  ( 1 )  " " " "      "*"      " "  "*"         " "   " "           
## 6  ( 1 )  " " " "      "*"      " "  "*"         " "   " "           
## 7  ( 1 )  " " " "      "*"      " "  "*"         " "   " "           
## 8  ( 1 )  " " " "      "*"      " "  "*"         "*"   " "           
## 9  ( 1 )  " " " "      "*"      "*"  "*"         "*"   " "           
## 10  ( 1 ) " " "*"      "*"      "*"  "*"         "*"   " "           
## 11  ( 1 ) " " "*"      "*"      "*"  "*"         "*"   " "           
## 12  ( 1 ) "*" "*"      "*"      "*"  "*"         "*"   " "           
## 13  ( 1 ) "*" "*"      "*"      "*"  "*"         "*"   "*"           
##           valence
## 1  ( 1 )  " "    
## 2  ( 1 )  " "    
## 3  ( 1 )  " "    
## 4  ( 1 )  " "    
## 5  ( 1 )  " "    
## 6  ( 1 )  " "    
## 7  ( 1 )  "*"    
## 8  ( 1 )  "*"    
## 9  ( 1 )  "*"    
## 10  ( 1 ) "*"    
## 11  ( 1 ) "*"    
## 12  ( 1 ) "*"    
## 13  ( 1 ) "*"

The forward selection procedure and exhaustive best subsets are in agreement with each other. Using Mallow’s Cp and adjusted R^2 as statistics for model quality, best subsets selected the best 10-variable model out of 13 possible predictor variables. The predictors left out of the best 10-variable model are energy, key, and time signature. Of these, the latter two are two of the three categorical variables.

library(Matrix)
library(glmnet)
## Loading required package: foreach
## Loaded glmnet 2.0-16
# Design matrices for glmnet from the earlier logical-vector train/test split.
# NOTE(review): model.matrix() keeps its intercept column, which is why the
# coefficient table below shows "(Intercept)" twice; the trailing commented
# subscript was presumably meant to drop it.
xTrain = model.matrix(target~.,data=train)#[,length(train)-1]
xTest = model.matrix(target~.,data=test)#[,length(test)-1]
yTrain = train$target
yTest = test$target

# NOTE(review): bestL is never used again — this cross-validation run is
# redundant with cv.out below (cv.glmnet defaults to alpha = 1). Removing it
# would shift the RNG state and change the results, so it is left in place.
bestL = cv.glmnet(xTrain, yTrain, family = "binomial")$lambda.min

# Cross-validate the LASSO (alpha = 1) logistic model to pick lambda.
cv.out=cv.glmnet(xTrain,yTrain,family = "binomial",alpha=1)
bestL2= bestlam=cv.out$lambda.min

# Refit at the CV-chosen lambda and evaluate class predictions on the test set.
lasso.mod=glmnet(xTrain,yTrain,alpha=1,family = "binomial",lambda=bestL2)
lasso.pred=predict(lasso.mod,s=bestL2,newx=xTest, type = "class")
mean(lasso.pred!= test$target)
## [1] 0.3712871
table(lasso.pred, test$target)
##           
## lasso.pred   0   1
##          0 310 174
##          1 201 325
predict(lasso.mod,type="coefficients",s=bestL2)
## 15 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)      -2.548315e+00
## (Intercept)       .           
## acousticness     -1.346831e+00
## danceability      6.695807e-01
## duration_ms       2.978270e-06
## energy            7.190558e-01
## instrumentalness  8.858914e-01
## key               2.287857e-02
## liveness         -1.137519e-01
## loudness         -1.023331e-01
## mode             -1.701086e-01
## speechiness       4.262528e+00
## tempo             1.539804e-03
## time_signature   -1.854267e-01
## valence           1.155356e+00

Fitting a random forest for variable importance, a best subsets, and a LASSO regression, variables which were less important in general were excluded moving forward. More weight was given to the best subsets and random forest importance as they are able to capture more complexity than the lasso regression which is strictly linear. Key is mostly deemed unimportant and time signature and mode are only considered somewhat important in LASSO. For this reason, these will be removed, meaning only numeric variables are remaining. Lastly, liveness will be dropped from the analysis because it is considered rather unimportant in LASSO and random forest, and not especially important in best subsets.

# Drop the variables judged unimportant above (all three categorical
# predictors plus liveness), leaving nine numeric predictors and target.
drops = c("key", "liveness", "mode", "time_signature")
data=data[, !(names(data) %in% drops)]

Now that variable selection has been done, the remaining variables will be standardized and prepared for PCA.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Standardize (z-score) the nine remaining numeric predictors; the factor
# response `target` is left untouched.
# NOTE(review): replaced the deprecated mutate_each_()/funs() call with
# mutate_at(); behavior is identical and the deprecation warnings printed
# below disappear on re-knit.
data = data %>% mutate_at(vars(acousticness, danceability, duration_ms, energy, instrumentalness, loudness, speechiness, tempo, valence), ~ as.vector(scale(.)))
## Warning: mutate_each() is deprecated
## Please use mutate_if(), mutate_at(), or mutate_all() instead: 
## 
##   - To map `funs` over all variables, use mutate_all()
##   - To map `funs` over a selection of variables, use mutate_at()
## This warning is displayed once per session.
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.

The block below fits a PCA to the data. The scree plot then helps visualize how much variance is explained by each PC.

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed (2)
# PCA on the nine standardized predictors; column 10 is the target factor.
# NOTE(review): prcomp() is deterministic, so the seed is unnecessary, and
# scale=TRUE is redundant after the explicit standardization above (harmless
# either way).
pr.out=prcomp(data[-10], scale=TRUE)

# Proportion of variance explained by each principal component (scree plot).
pr.var=pr.out$sdev ^2
pve=pr.var/sum(pr.var)
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained ", ylim=c(0,1),type="b")

The PCA here is not very useful, as the elbow at 2 PCs accounts for only about 50% of the variance explained. There is no point beyond this where there is a good balance between dimension reduction and explanation of variability. Therefore, it is not worth sacrificing the interpretability to force this forward through model fitting.

Training and Testing Sets

As the number following the train/test set increases, more data is in the test set and less is in training set.

set.seed (45)
# Row-index samples giving training fractions of 90%, 70%, 50%, 30%, 10%.
# NOTE(review): the five samples are drawn independently, so the smaller
# training sets are not nested within the larger ones.
train90=sample( nrow(data),size = dim(data)[1]*0.90)
train70=sample( nrow(data),size = dim(data)[1]*0.70)
train50=sample( nrow(data),size = dim(data)[1]*0.50)
train30=sample( nrow(data),size = dim(data)[1]*0.30)
train10=sample( nrow(data),size = dim(data)[1]*0.10)
# Materialize each train/test pair; each testN is the complement of trainN.
train1 = data[train90,]
test1 = data[-train90,]

train2 = data[train70,]
test2 = data[-train70,]

train3 = data[train50,]
test3 = data[-train50,]

train4 = data[train30,]
test4 = data[-train30,]

train5 = data[train10,]
test5 = data[-train10,]

Each classification algorithm will be run with train/test splits of 90/10, 70/30, 50/50, 30/70, and 10/90. The outcome of choosing to use these sizes of data partitions will be considered after each algorithm is tuned to have the optimal parameters, trained, and told to predict on the unseen test data. The quality of the models will be measured by test error and its flexibility/appropriateness.

Model Fitting

Logistic Regression

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
logReg1 = glm(target~., data  = train1, family = "binomial")
logReg2 = glm(target~., data  = train2, family = "binomial")
logReg3 = glm(target~., data  = train3, family = "binomial")
logReg4 = glm(target~., data  = train4, family = "binomial")
logReg5 = glm(target~., data  = train5, family = "binomial")

# Convert each fitted logistic model's test-set output into 0/1 class
# predictions with a 0.5 probability cutoff.
# NOTE(review): preds2–preds5 originally thresholded predict()'s default
# link-scale output (log-odds) at 0.5, while preds1 thresholded fitted
# probabilities. A 0.5 cut on the log-odds scale corresponds to p ≈ 0.62,
# so the five models were not being compared consistently. All predictions
# now use type = "response"; re-knit to refresh the error rates below.
preds1 = predict(logReg1, test1, type = "response")
preds1 = ifelse(preds1 > 0.5, 1, 0)
preds2 = predict(logReg2, test2, type = "response")
preds2 = ifelse(preds2 > 0.5, 1, 0)
preds3 = predict(logReg3, test3, type = "response")
preds3 = ifelse(preds3 > 0.5, 1, 0)
preds4 = predict(logReg4, test4, type = "response")
preds4 = ifelse(preds4 > 0.5, 1, 0)
preds5 = predict(logReg5, test5, type = "response")
preds5 = ifelse(preds5 > 0.5, 1, 0)


mean(preds1!=test1$target)
## [1] 0.3514851
mean(preds2!=test2$target)
## [1] 0.3811881
mean(preds3!=test3$target)
## [1] 0.3756194
mean(preds4!=test4$target)
## [1] 0.3916431
mean(preds5!=test5$target)
## [1] 0.3981278

The logistic regression model here has about a 4.5% difference in test error between the models fit on the most training data and the least training data. This is not a massive range, but it does indicate that there is some sensitivity to the size of the training dataset. The more data the model is trained on here, the better it performs on predicting new data; this is true for all models and should be seen throughout, but it especially applies to logistic regression, which typically relies on a large sample size. It works reasonably well considering there are only 2 classes for target, but the high error may indicate a more complex separation.

LDA

# Fit LDA on each training partition and score it on the matching test
# partition (90/10, 70/30, 50/50, 30/70, 10/90 train/test).
# NOTE(review): the original chunk fit and scored the 70/30 split twice,
# never fit the 30/70 split (train4/test4), and scored the 50/50 model
# (train3) against test4 rather than test3. Each model below is now paired
# with its own test set; re-knit to regenerate the confusion matrices and
# the test-error rates quoted in the discussion.
lda.fit = lda(target~.,data = train1)
lda.class = predict(lda.fit, test1)$class
table(lda.class, test1$target)    # confusion matrix, 90/10 split
mean(lda.class != test1$target)   # test error, 90/10 split

lda.fit = lda(target~.,data = train2)
lda.class = predict(lda.fit, test2)$class
table(lda.class, test2$target)    # confusion matrix, 70/30 split
mean(lda.class != test2$target)   # test error, 70/30 split

lda.fit = lda(target~.,data = train3)
lda.class = predict(lda.fit, test3)$class
table(lda.class, test3$target)    # confusion matrix, 50/50 split
mean(lda.class != test3$target)   # test error, 50/50 split

lda.fit = lda(target~.,data = train4)
lda.class = predict(lda.fit, test4)$class
table(lda.class, test4$target)    # confusion matrix, 30/70 split
mean(lda.class != test4$target)   # test error, 30/70 split

lda.fit = lda(target~.,data = train5)
lda.class = predict(lda.fit, test5)$class
table(lda.class, test5$target)    # confusion matrix, 10/90 split
mean(lda.class != test5$target)   # test error, 10/90 split

In LDA, there is an interesting occurrence in that the partition with 30% training data is the set which has the lowest test error. Other than this, the test error is a bit lower than in logistic regression, with a similar sensitivity to the size of the training data. The fact that the 30% training data partition has the lowest test error could be indicative of high variability in the data. The approximation of the Bayes classifier here in LDA makes an improvement upon the version used in logistic regression, but not by a significant amount. In QDA, it will be seen whether greater flexibility helps in capturing a more complex relation between the predictors and the response.

QDA

qda.fit = qda(target~.,data = train1)
qda.pred = predict(qda.fit, test1)
qda.class = qda.pred$class
table(qda.class, test1$target)
##          
## qda.class  0  1
##         0 85 44
##         1 17 56
mean(qda.class != test1$target)
## [1] 0.3019802
qda.fit = qda(target~.,data = train2)
qda.pred = predict(qda.fit, test2)
qda.class = qda.pred$class
table(qda.class, test2$target)
##          
## qda.class   0   1
##         0 246 134
##         1  49 177
mean(qda.class != test2$target)
## [1] 0.3019802
qda.fit = qda(target~.,data = train3)
qda.pred = predict(qda.fit, test3)
qda.class = qda.pred$class
table(qda.class, test3$target)
##          
## qda.class   0   1
##         0 421 208
##         1  82 298
mean(qda.class != test3$target)
## [1] 0.2874133
qda.fit = qda(target~.,data = train4)
qda.pred = predict(qda.fit, test4)
qda.class = qda.pred$class
table(qda.class, test4$target)
##          
## qda.class   0   1
##         0 517 291
##         1 160 444
mean(qda.class != test4$target)
## [1] 0.3194051
qda.fit = qda(target~.,data = train5)
qda.pred = predict(qda.fit, test5)
qda.class = qda.pred$class
table(qda.class, test5$target)
##          
## qda.class   0   1
##         0 687 346
##         1 209 574
mean(qda.class != test5$target)
## [1] 0.3056167

In QDA, there is a solid improvement in correctly classifying whether or not the listener liked a song, across training sizes. There is also a narrower range of test errors, indicating that the more flexible QDA is less sensitive to the size of the train/test split it is trained on. This is not to say there is no variability in the classification error, but the model performs a bit more similarly on test data given various sizes of training data. Something else interesting to note here is that the 30% training data partition, which had the lowest LDA error, has the highest QDA error, meaning it is possible that the random sampling of the 30% training partition happened to select more linearly separable points, when in reality there are relations in the data which need to be accounted for with more flexible methods.

KNN

library(e1071)
library(class)

set.seed(100)
# 90/10 split: tune K over 1..100 by cross-validation, then predict on test.
train = train1
train.x = train[,-10]   # predictors (target is column 10)
train.y = train[,10]    # response factor
test = test1
test.x = test[,-10]
test.y = test[,10]
knn.tune = tune.knn(train.x, train.y, k = 1:100)
knn.tune$best.parameters
# NOTE(review): best.parameters is a one-row data frame; knn() appears to
# accept it here, but passing knn.tune$best.parameters$k would be cleaner.
knn.pred = knn(train.x, test.x, train.y, k = knn.tune$best.parameters)
table(knn.pred, test.y)
##         test.y
## knn.pred  0  1
##        0 81 39
##        1 21 61
mean(knn.pred != test.y)
## [1] 0.2970297
set.seed(100)
train = train2
train.x = train[,-10]
train.y = train[,10]
test = test2
test.x = test[,-10]
test.y = test[,10]
knn.tune = tune.knn(train.x, train.y, k = 1:100)
knn.tune$best.parameters
knn.pred = knn(train.x, test.x, train.y, k = knn.tune$best.parameters)
table(knn.pred, test.y)
##         test.y
## knn.pred   0   1
##        0 245 118
##        1  50 193
mean(knn.pred != test.y)
## [1] 0.2772277
set.seed(100)
train = train3
train.x = train[,-10]
train.y = train[,10]
test = test3
test.x = test[,-10]
test.y = test[,10]
knn.tune = tune.knn(train.x, train.y, k = 1:100)
knn.tune$best.parameters
knn.pred = knn(train.x, test.x, train.y, k = knn.tune$best.parameters)
table(knn.pred, test.y)
##         test.y
## knn.pred   0   1
##        0 404 210
##        1  99 296
mean(knn.pred != test.y)
## [1] 0.3062438
set.seed(100)
train = train4
train.x = train[,-10]
train.y = train[,10]
test = test4
test.x = test[,-10]
test.y = test[,10]
knn.tune = tune.knn(train.x, train.y, k = 1:100)
knn.tune$best.parameters
knn.pred = knn(train.x, test.x, train.y, k = knn.tune$best.parameters)
table(knn.pred, test.y)
##         test.y
## knn.pred   0   1
##        0 552 299
##        1 125 436
mean(knn.pred != test.y)
## [1] 0.3002833
set.seed(100)
train = train5
train.x = train[,-10]
train.y = train[,10]
test = test5
test.x = test[,-10]
test.y = test[,10]
knn.tune = tune.knn(train.x, train.y, k = 1:100)
knn.tune$best.parameters
knn.pred = knn(train.x, test.x, train.y, k = knn.tune$best.parameters)
table(knn.pred, test.y)
##         test.y
## knn.pred   0   1
##        0 656 404
##        1 240 516
mean(knn.pred != test.y)
## [1] 0.3546256

Models found from KNN have similar test errors to QDA, largely around 30% with 27.7% on the low end and 35.5% on the high end. In general, this is fairly sensitive to the amount of training data as lower errors were found when we used larger amounts of training data. The largest error occurs when we only have 10% of the data as training, however the lowest test error occurs with the 70/30 training test split. This may be due to dis-proportionate noise in the 90/10 split or other variation in these partitions.

The K selected by cross validation is also very sensitive to the amount of training data. As the amount of training data decreased, so did K from 28 with 90% of the data all the way down to 4 when trained on only 10% of the data. With more training data and a higher K, we are aggregating more and decreasing variance which may better approximate the true relationship, thus produce a lower test error.

Random Forest

set.seed(50)
train = train1
train.x = train[,-10]
train.y = train[,10]
test = test1
test.x = test[,-10]
test.y = test[,10]
tune.rf = tune.randomForest(train.x, train.y, mtry = seq(1, 9, 2), ntree = seq(300, 900, 200),
                            tunecontrol = tune.control(sampling = "cross", cross = 3))
summary(tune.rf)
## 
## Parameter tuning of 'randomForest':
## 
## - sampling method: 3-fold cross validation 
## 
## - best parameters:
##  mtry ntree
##     1   900
## 
## - best performance: 0.2198347 
## 
## - Detailed performance results:
##    mtry ntree     error dispersion
## 1     1   300 0.2203857 0.02339490
## 2     3   300 0.2314050 0.02110272
## 3     5   300 0.2319559 0.01857820
## 4     7   300 0.2336088 0.01570973
## 5     9   300 0.2363636 0.02597725
## 6     1   500 0.2253444 0.02150878
## 7     3   500 0.2264463 0.02867661
## 8     5   500 0.2319559 0.02157220
## 9     7   500 0.2336088 0.02079844
## 10    9   500 0.2314050 0.02148760
## 11    1   700 0.2247934 0.02434863
## 12    3   700 0.2231405 0.02400965
## 13    5   700 0.2264463 0.02071068
## 14    7   700 0.2374656 0.02244123
## 15    9   700 0.2380165 0.01652893
## 16    1   900 0.2198347 0.02495813
## 17    3   900 0.2269972 0.02573068
## 18    5   900 0.2308540 0.01736193
## 19    7   900 0.2319559 0.01655645
## 20    9   900 0.2336088 0.02397169
yhat.rf = predict(tune.rf$best.model, newdata = test)
table(yhat.rf, test$target)
##        
## yhat.rf  0  1
##       0 80 23
##       1 22 77
mean(yhat.rf != test$target)
## [1] 0.2227723
set.seed(50)
train = train2
train.x = train[,-10]
train.y = train[,10]
test = test2
test.x = test[,-10]
test.y = test[,10]
tune.rf = tune.randomForest(train.x, train.y, mtry = seq(1, 9, 2), ntree = seq(300, 900, 200),
                            tunecontrol = tune.control(sampling = "cross", cross = 3))
summary(tune.rf)
## 
## Parameter tuning of 'randomForest':
## 
## - sampling method: 3-fold cross validation 
## 
## - best parameters:
##  mtry ntree
##     1   300
## 
## - best performance: 0.2182922 
## 
## - Detailed performance results:
##    mtry ntree     error dispersion
## 1     1   300 0.2182922 0.04138387
## 2     3   300 0.2246766 0.04159193
## 3     5   300 0.2289244 0.04079208
## 4     7   300 0.2267922 0.03314792
## 5     9   300 0.2360046 0.03517091
## 6     1   500 0.2282182 0.04879282
## 7     3   500 0.2324690 0.03708080
## 8     5   500 0.2345891 0.02489324
## 9     7   500 0.2352999 0.02957244
## 10    9   500 0.2367093 0.03309538
## 11    1   700 0.2190104 0.05013162
## 12    3   700 0.2289289 0.03771241
## 13    5   700 0.2267953 0.03164075
## 14    7   700 0.2274970 0.03089053
## 15    9   700 0.2289139 0.03198875
## 16    1   900 0.2239734 0.05309700
## 17    3   900 0.2260936 0.04234426
## 18    5   900 0.2239674 0.03648571
## 19    7   900 0.2345876 0.03415273
## 20    9   900 0.2338739 0.03421124
yhat.rf = predict(tune.rf$best.model, newdata = test)
table(yhat.rf, test$target)
##        
## yhat.rf   0   1
##       0 232  81
##       1  63 230
mean(yhat.rf != test$target)
## [1] 0.2376238
set.seed(50)
train = train3
train.x = train[,-10]
train.y = train[,10]
test = test3
test.x = test[,-10]
test.y = test[,10]
tune.rf = tune.randomForest(train.x, train.y, mtry = seq(1, 9, 2), ntree = seq(300, 900, 200), tunecontrol = tune.control(sampling = "cross", cross = 3))
summary(tune.rf)
## 
## Parameter tuning of 'randomForest':
## 
## - sampling method: 3-fold cross validation 
## 
## - best parameters:
##  mtry ntree
##     1   900
## 
## - best performance: 0.2212302 
## 
## - Detailed performance results:
##    mtry ntree     error  dispersion
## 1     1   300 0.2321429 0.011904762
## 2     3   300 0.2222222 0.012028131
## 3     5   300 0.2251984 0.006195435
## 4     7   300 0.2301587 0.007489915
## 5     9   300 0.2311508 0.022337957
## 6     1   500 0.2271825 0.014064927
## 7     3   500 0.2232143 0.014880952
## 8     5   500 0.2281746 0.016391579
## 9     7   500 0.2301587 0.012390869
## 10    9   500 0.2361111 0.018901348
## 11    1   700 0.2242063 0.016391579
## 12    3   700 0.2232143 0.015748520
## 13    5   700 0.2242063 0.013420386
## 14    7   700 0.2301587 0.019134228
## 15    9   700 0.2301587 0.016391579
## 16    1   900 0.2212302 0.014979830
## 17    3   900 0.2222222 0.006195435
## 18    5   900 0.2291667 0.018103460
## 19    7   900 0.2321429 0.007874260
## 20    9   900 0.2341270 0.015272623
yhat.rf = predict(tune.rf$best.model, newdata = test)
table(yhat.rf, test$target)
##        
## yhat.rf   0   1
##       0 387 122
##       1 116 384
mean(yhat.rf != test$target)
## [1] 0.2358771
set.seed(50)
train = train4
train.x = train[,-10]
train.y = train[,10]
test = test4
test.x = test[,-10]
test.y = test[,10]
tune.rf = tune.randomForest(train.x, train.y, mtry = seq(1, 9, 2), ntree = seq(300, 900, 200), tunecontrol = tune.control(sampling = "cross", cross = 3))
summary(tune.rf)
## 
## Parameter tuning of 'randomForest':
## 
## - sampling method: 3-fold cross validation 
## 
## - best parameters:
##  mtry ntree
##     1   700
## 
## - best performance: 0.2363759 
## 
## - Detailed performance results:
##    mtry ntree     error  dispersion
## 1     1   300 0.2496100 0.013153476
## 2     3   300 0.2446513 0.019202259
## 3     5   300 0.2578773 0.018420800
## 4     7   300 0.2562271 0.018033939
## 5     9   300 0.2628360 0.013880551
## 6     1   500 0.2463015 0.011024488
## 7     3   500 0.2479352 0.005001290
## 8     5   500 0.2496100 0.015701469
## 9     7   500 0.2578609 0.018045120
## 10    9   500 0.2678029 0.017973763
## 11    1   700 0.2363759 0.006413107
## 12    3   700 0.2479763 0.022355812
## 13    5   700 0.2479763 0.022355812
## 14    7   700 0.2644944 0.016714752
## 15    9   700 0.2612104 0.028100805
## 16    1   900 0.2479517 0.009313936
## 17    3   900 0.2446513 0.013140303
## 18    5   900 0.2463015 0.013059638
## 19    7   900 0.2545605 0.008277812
## 20    9   900 0.2694531 0.016728815
yhat.rf = predict(tune.rf$best.model, newdata = test)
table(yhat.rf, test$target)
##        
## yhat.rf   0   1
##       0 533 192
##       1 144 543
mean(yhat.rf != test$target)
## [1] 0.2379603
set.seed(50)
# 10/90 split: only ~200 training rows, so a smaller ntree grid is searched
# to keep the 3-fold cross-validation cheap.
train = train5
train.x = train[,-10]   # predictors (target is column 10)
train.y = train[,10]    # response factor
test = test5
test.x = test[,-10]
test.y = test[,10]
# NOTE(review): removed the dangling empty `error.fun = ` argument that was
# left inside tune.control(); an empty named argument is at best ignored and
# at worst raises a "missing argument" error when evaluated.
tune.rf = tune.randomForest(train.x, train.y, mtry = seq(1, 9, 2), ntree = seq(50, 150, 25), tunecontrol = tune.control(sampling = "cross", cross = 3))
summary(tune.rf)
## 
## Parameter tuning of 'randomForest':
## 
## - sampling method: 3-fold cross validation 
## 
## - best parameters:
##  mtry ntree
##     1    75
## 
## - best performance: 0.2537313 
## 
## - Detailed performance results:
##    mtry ntree     error  dispersion
## 1     1    50 0.3034826 0.047978362
## 2     3    50 0.3134328 0.039488826
## 3     5    50 0.3283582 0.014925373
## 4     7    50 0.3233831 0.022798884
## 5     9    50 0.3034826 0.008617168
## 6     1    75 0.2537313 0.051703009
## 7     3    75 0.3333333 0.047978362
## 8     5    75 0.2985075 0.025851505
## 9     7    75 0.2835821 0.039488826
## 10    9    75 0.3184080 0.034468673
## 11    1   100 0.2985075 0.025851505
## 12    3   100 0.3034826 0.008617168
## 13    5   100 0.3084577 0.017234336
## 14    7   100 0.2885572 0.017234336
## 15    9   100 0.3084577 0.067302235
## 16    1   125 0.2985075 0.053814198
## 17    3   125 0.3283582 0.025851505
## 18    5   125 0.3333333 0.031069642
## 19    7   125 0.2985075 0.014925373
## 20    9   125 0.2985075 0.014925373
## 21    1   150 0.3233831 0.031069642
## 22    3   150 0.3184080 0.022798884
## 23    5   150 0.2935323 0.008617168
## 24    7   150 0.2985075 0.025851505
## 25    9   150 0.3233831 0.037561365
yhat.rf = predict(tune.rf$best.model, newdata = test)
table(yhat.rf, test$target)
##        
## yhat.rf   0   1
##       0 667 308
##       1 229 612
mean(yhat.rf != test$target)
## [1] 0.2957048

For random forests, we wanted to validate both the number of variables considered at each split and the total number of trees. The training sets have observations numbering from ~200 to ~1800, so we adjusted the considered number of trees to be realistic for each model without exponentially increasing computation time. There is no clear pattern as to how the number of trees was chosen, with some training sets choosing higher, lower, and moderate numbers of trees. For the number of variables, all random forest models selected the number of variables considered at each split to be 1. This may be because there is not a clear “best” way to split the data, and simply using more variables regardless of their impact at a particular place in the tree gives results that best represent the true relationship.

The test error rates here are quite low compared to the other methods so far ranging from 22.3 to 29.6%. Again, the highest error rate corresponded to the 10/90 model – the one with the least training data. Random forests however does appear to be less sensitive to the size of the training data as all other models had very similar test errors all around 23%.

Boosting

###Will need to validate number of trees and number of variables for each tree and lambda

library("caret")
## 
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
## 
##     R2
library("gbm")
## Loaded gbm 2.1.5
# Tuning grid: number of trees, shrinkage (learning rate), and tree
# interaction depth; n.minobsinnode is held at the gbm default of 10
paramGrid = expand.grid(n.trees=seq(100,1000,100), shrinkage = c(0.01, 0.05, 0.075, 0.1) ,interaction.depth = c(1, 2, 3, 4), n.minobsinnode=10)
trainControl <- trainControl(method="cv", number=10)

# Boosted classification trees on the 90% training split,
# tuning parameters chosen by 10-fold cross validation
gbmMod =train(target~ ., data=train1, distribution="bernoulli", method="gbm",
              trControl=trainControl, verbose=FALSE, 
              tuneGrid=paramGrid)
print("Optimal Number of Trees")
## [1] "Optimal Number of Trees"
gbmMod$finalModel$n.trees
## [1] 300
print("Optimal Shrinkage")
## [1] "Optimal Shrinkage"
gbmMod$finalModel$shrinkage
## [1] 0.05
print("Interaction Depth")
## [1] "Interaction Depth"
gbmMod$finalModel$interaction.depth
## [1] 4
yhat.boost=predict(gbmMod,newdata=test1, n.trees = gbmMod$finalModel$n.trees)
print("Test Error")
## [1] "Test Error"
mean(yhat.boost!=test1$target)
## [1] 0.2425743
summary(gbmMod)  # relative variable-importance table/plot

The split with 90% training data shows speechiness as the most important, with instrumentalness not too far behind.

# Boosted trees on the 70% training split (same grid and CV scheme as above)
gbmMod =train(target~ ., data=train2, distribution="bernoulli", method="gbm",
              trControl=trainControl, verbose=FALSE, 
              tuneGrid=paramGrid)
print("Optimal Number of Trees")
## [1] "Optimal Number of Trees"
gbmMod$finalModel$n.trees
## [1] 300
print("Optimal Shrinkage")
## [1] "Optimal Shrinkage"
gbmMod$finalModel$shrinkage
## [1] 0.075
print("Interaction Depth")
## [1] "Interaction Depth"
gbmMod$finalModel$interaction.depth
## [1] 4
yhat.boost=predict(gbmMod,newdata=test2, n.trees = gbmMod$finalModel$n.trees)
mean(yhat.boost!=test2$target)  # test misclassification rate
## [1] 0.2557756
summary(gbmMod)  # relative variable-importance table/plot

The split with 70% training data again shows speechiness as the most important, with instrumentalness not too far behind. The importance of the variables below these have slightly shifted positions.

# Boosted trees on the 50% training split (same grid and CV scheme as above)
gbmMod =train(target~ ., data=train3, distribution="bernoulli", method="gbm",
              trControl=trainControl, verbose=FALSE, 
              tuneGrid=paramGrid)
print("Optimal Number of Trees")
## [1] "Optimal Number of Trees"
gbmMod$finalModel$n.trees
## [1] 400
print("Optimal Shrinkage")
## [1] "Optimal Shrinkage"
gbmMod$finalModel$shrinkage
## [1] 0.1
print("Interaction Depth")
## [1] "Interaction Depth"
gbmMod$finalModel$interaction.depth
## [1] 4
yhat.boost=predict(gbmMod,newdata=test3, n.trees = gbmMod$finalModel$n.trees)
print("Test Error")
## [1] "Test Error"
mean(yhat.boost!=test3$target)
## [1] 0.2626363
summary(gbmMod)  # relative variable-importance table/plot

The split with 50% training data again shows speechiness as the most important. The importance of the variables below these have slightly shifted positions.

# Boosted trees on the 30% training split (same grid and CV scheme as above)
gbmMod =train(target~ ., data=train4, distribution="bernoulli", method="gbm",
              trControl=trainControl, verbose=FALSE, 
              tuneGrid=paramGrid)
print("Optimal Number of Trees")
## [1] "Optimal Number of Trees"
gbmMod$finalModel$n.trees
## [1] 600
print("Optimal Shrinkage")
## [1] "Optimal Shrinkage"
gbmMod$finalModel$shrinkage
## [1] 0.01
print("Interaction Depth")
## [1] "Interaction Depth"
gbmMod$finalModel$interaction.depth
## [1] 3
yhat.boost=predict(gbmMod,newdata=test4, n.trees = gbmMod$finalModel$n.trees)
print("Test Error")
## [1] "Test Error"
mean(yhat.boost!=test4$target)
## [1] 0.259915
summary(gbmMod)  # relative variable-importance table/plot

The split with 30% training data shows instrumentalness as the most important variable by far.

# Boosted trees on the 10% training split (same grid and CV scheme as above)
gbmMod =train(target~ ., data=train5, distribution="bernoulli", method="gbm",
              trControl=trainControl, verbose=FALSE, 
              tuneGrid=paramGrid)
print("Optimal Number of Trees")
## [1] "Optimal Number of Trees"
gbmMod$finalModel$n.trees
## [1] 100
print("Optimal Shrinkage")
## [1] "Optimal Shrinkage"
gbmMod$finalModel$shrinkage
## [1] 0.05
print("Interaction Depth")
## [1] "Interaction Depth"
gbmMod$finalModel$interaction.depth
## [1] 2
yhat.boost=predict(gbmMod,newdata=test5, n.trees = gbmMod$finalModel$n.trees)
print("Test Error")
## [1] "Test Error"
mean(yhat.boost!=test5$target)
## [1] 0.3199339
summary(gbmMod)  # relative variable-importance table/plot

The split with 10% training data shows loudness and acousticness as some of the most important variables.

When boosting was applied to the model, there was a large amount of sensitivity to the size of the training data. When the 90% training data partition was considered, the model had one of the lowest test errors amongst all models considered. The tuning parameters considered were number of trees, shrinkage, and interaction depth. In general, the models all perform better with deeper interaction depths, and this is especially true when there is more data, as cross validation chose an interaction depth of 4 for the larger training sets. The shrinkage parameter does not show a strong tendency in either direction, but the number of trees does. The models with more training data tend to be optimized with more trees (in general), presumably to account for the added complexity. In conjunction with this, the models with more training data performed better and had lower test error. There was greater test error for models trained on less data, which shows that boosting does have significant sensitivity to the amount of data it was trained on.

SVM

##Linear Kernel

# Linear-kernel SVM on the 90/10 split; cost tuned by 10-fold CV (e1071::tune)
train = train1
test = test1

tune.out <- tune(svm, target ~ ., data = train, kernel = "linear", 
            ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.3344424 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.3558982 0.04339863
## 2 1e-02 0.3366493 0.02640946
## 3 1e-01 0.3355443 0.02913604
## 4 1e+00 0.3344424 0.02706749
## 5 5e+00 0.3355473 0.02627035
## 6 1e+01 0.3360998 0.02557907
## 7 1e+02 0.3360998 0.02687229
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred  0  1
##     0 74 41
##     1 28 59
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3415842
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

On the 90% training dataset cost was equal to 1

# Linear-kernel SVM on the 70/30 split; cost tuned by 10-fold CV
train = train2
test = test2

tune.out <- tune(svm, target ~ ., data = train, kernel = "linear", 
            ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.328149 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.3564729 0.04693787
## 2 1e-02 0.3380881 0.05346923
## 3 1e-01 0.3281490 0.05305271
## 4 1e+00 0.3281490 0.05123056
## 5 5e+00 0.3281490 0.05123056
## 6 1e+01 0.3288583 0.05014795
## 7 1e+02 0.3288583 0.05014795
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 223 125
##     1  72 186
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3250825
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

On the 70% training dataset cost was equal to 0.1

# Linear-kernel SVM on the 50/50 split; cost tuned by 10-fold CV
train = train3
test = test3

tune.out <- tune(svm, target ~ ., data = train, kernel = "linear", 
            ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.3381881 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.4790000 0.06755108
## 2 1e-02 0.3441683 0.03455389
## 3 1e-01 0.3431485 0.04489463
## 4 1e+00 0.3381881 0.04376666
## 5 5e+00 0.3401683 0.04363855
## 6 1e+01 0.3401683 0.04363855
## 7 1e+02 0.3401683 0.04363855
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 398 225
##     1 105 281
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3270565
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

On the 50% dataset cost was equal to 1

# Linear-kernel SVM on the 30/70 split; cost tuned by 10-fold CV
train = train4
test = test4

tune.out <- tune(svm, target ~ ., data = train, kernel = "linear", 
            ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.3438798 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.4710109 0.06354996
## 2 1e-02 0.3570219 0.05207084
## 3 1e-01 0.3471585 0.05033872
## 4 1e+00 0.3438798 0.05027012
## 5 5e+00 0.3439071 0.04991641
## 6 1e+01 0.3439071 0.04991641
## 7 1e+02 0.3439071 0.04991641
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 532 351
##     1 145 384
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3512748
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

On the 30% dataset cost was equal to 1

# Linear-kernel SVM on the 10/90 split; cost tuned by 10-fold CV
train = train5
test = test5

tune.out <- tune(svm, target ~ ., data = train, kernel = "linear", 
            ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.2933333 
## 
## - Detailed performance results:
##    cost     error dispersion
## 1 1e-03 0.5573810 0.05285321
## 2 1e-02 0.3928571 0.09147320
## 3 1e-01 0.2933333 0.08541836
## 4 1e+00 0.3183333 0.08181958
## 5 5e+00 0.3283333 0.07496913
## 6 1e+01 0.3283333 0.07496913
## 7 1e+02 0.3283333 0.07496913
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 632 391
##     1 264 529
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3606828
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

On the 10% dataset cost was equal to 0.1

For all datasets with a linear kernel, cost was either 0.1 or 1. Test error was 34.2%, 32.5%, 32.7%, 35.1%, and 36.1% for 90/10 through 10/90 training/test datasets respectively. There does not seem to be a clear relationship between training data size and test error, though the medium sized training sets did perform the best in this case.

##Radial Kernel

# Radial (RBF) kernel SVM on the 90/10 split; cost and gamma tuned by 10-fold CV
train = train1
test = test1

tune.out <- tune(svm, target ~ ., data = train, kernel = "radial", 
            ranges = list(cost = c(0.01, 0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1, 10)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1   0.1
## 
## - best performance: 0.2424716 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1   0.01  0.01 0.4930818 0.03726867
## 2   0.10  0.01 0.3162983 0.03615761
## 3   1.00  0.01 0.2948060 0.02859747
## 4  10.00  0.01 0.2639670 0.04207417
## 5   0.01  0.10 0.4065782 0.06949052
## 6   0.10  0.10 0.2804808 0.03465537
## 7   1.00  0.10 0.2424716 0.04067874
## 8  10.00  0.10 0.2457653 0.03576592
## 9   0.01  1.00 0.4930818 0.03726867
## 10  0.10  1.00 0.3807237 0.02586574
## 11  1.00  1.00 0.2644679 0.03236362
## 12 10.00  1.00 0.2804201 0.04231009
## 13  0.01 10.00 0.4930818 0.03726867
## 14  0.10 10.00 0.4930818 0.03726867
## 15  1.00 10.00 0.4258636 0.04900410
## 16 10.00 10.00 0.4076893 0.05209789
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred  0  1
##     0 82 31
##     1 20 69
mean(ypred != test$target)  # test misclassification rate
## [1] 0.2524752
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 90% radial model had cost = 1 and gamma = 0.1

# Radial-kernel SVM on the 70/30 split; cost and gamma tuned by 10-fold CV
train = train2
test = test2

tune.out <- tune(svm, target ~ ., data = train, kernel = "radial", 
            ranges = list(cost = c(0.01, 0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1, 10)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1   0.1
## 
## - best performance: 0.2502048 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1   0.01  0.01 0.5400609 0.02623240
## 2   0.10  0.01 0.3316702 0.04411305
## 3   1.00  0.01 0.2969633 0.02340806
## 4  10.00  0.01 0.2551493 0.02279937
## 5   0.01  0.10 0.5173659 0.03427797
## 6   0.10  0.10 0.2806563 0.03107623
## 7   1.00  0.10 0.2502048 0.02760796
## 8  10.00  0.10 0.2587054 0.02900911
## 9   0.01  1.00 0.5400609 0.02623240
## 10  0.10  1.00 0.3890920 0.03097871
## 11  1.00  1.00 0.2806613 0.02830181
## 12 10.00  1.00 0.2877285 0.02600784
## 13  0.01 10.00 0.5400609 0.02623240
## 14  0.10 10.00 0.5400609 0.02623240
## 15  1.00 10.00 0.4521826 0.07119616
## 16 10.00 10.00 0.4373090 0.07699513
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 236 103
##     1  59 208
mean(ypred != test$target)  # test misclassification rate
## [1] 0.2673267
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 70% radial model had cost = 1 and gamma = 0.1

# Radial-kernel SVM on the 50/50 split; cost and gamma tuned by 10-fold CV
train = train3
test = test3

tune.out <- tune(svm, target ~ ., data = train, kernel = "radial", 
            ranges = list(cost = c(0.01, 0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1, 10)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1   0.1
## 
## - best performance: 0.237099 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1   0.01  0.01 0.4900891 0.03251796
## 2   0.10  0.01 0.3214455 0.02865904
## 3   1.00  0.01 0.2996436 0.04354283
## 4  10.00  0.01 0.2648614 0.03539057
## 5   0.01  0.10 0.4900891 0.03251796
## 6   0.10  0.10 0.2887327 0.04296137
## 7   1.00  0.10 0.2370990 0.04946339
## 8  10.00  0.10 0.2500594 0.03380248
## 9   0.01  1.00 0.4900891 0.03251796
## 10  0.10  1.00 0.4196733 0.04412357
## 11  1.00  1.00 0.2550000 0.03685354
## 12 10.00  1.00 0.2698812 0.03903089
## 13  0.01 10.00 0.4900891 0.03251796
## 14  0.10 10.00 0.4900891 0.03251796
## 15  1.00 10.00 0.4385050 0.05815149
## 16 10.00 10.00 0.4255941 0.05815693
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 419 176
##     1  84 330
mean(ypred != test$target)  # test misclassification rate
## [1] 0.2576809
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 50% radial model had cost = 1 and gamma = 0.1

# Radial-kernel SVM on the 30/70 split; cost and gamma tuned by 10-fold CV
train = train4
test = test4

tune.out <- tune(svm, target ~ ., data = train, kernel = "radial", 
            ranges = list(cost = c(0.01, 0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1, 10)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10   0.1
## 
## - best performance: 0.2645082 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1   0.01  0.01 0.4712022 0.06015842
## 2   0.10  0.01 0.4662568 0.06529772
## 3   1.00  0.01 0.3423770 0.04918867
## 4  10.00  0.01 0.3175410 0.05056422
## 5   0.01  0.10 0.4712022 0.06015842
## 6   0.10  0.10 0.3290164 0.04828329
## 7   1.00  0.10 0.2762022 0.05935016
## 8  10.00  0.10 0.2645082 0.05916707
## 9   0.01  1.00 0.4712022 0.06015842
## 10  0.10  1.00 0.4712022 0.06015842
## 11  1.00  1.00 0.3021858 0.06895214
## 12 10.00  1.00 0.3236339 0.07462794
## 13  0.01 10.00 0.4712022 0.06015842
## 14  0.10 10.00 0.4712022 0.06015842
## 15  1.00 10.00 0.4596175 0.04808860
## 16 10.00 10.00 0.4596448 0.04520187
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 507 213
##     1 170 522
mean(ypred != test$target)  # test misclassification rate
## [1] 0.2712465
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 30% radial model had cost = 10 and gamma = 0.1

# Radial-kernel SVM on the 10/90 split; cost and gamma tuned by 10-fold CV
train = train5
test = test5

tune.out <- tune(svm, target ~ ., data = train, kernel = "radial", 
            ranges = list(cost = c(0.01, 0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1, 10)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10  0.01
## 
## - best performance: 0.2880952 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1   0.01  0.01 0.5671429 0.06668367
## 2   0.10  0.01 0.5671429 0.06668367
## 3   1.00  0.01 0.3478571 0.05881001
## 4  10.00  0.01 0.2880952 0.10007556
## 5   0.01  0.10 0.5671429 0.06668367
## 6   0.10  0.10 0.4976190 0.09747376
## 7   1.00  0.10 0.3128571 0.12466736
## 8  10.00  0.10 0.3380952 0.11880929
## 9   0.01  1.00 0.5671429 0.06668367
## 10  0.10  1.00 0.5671429 0.06668367
## 11  1.00  1.00 0.3778571 0.10975390
## 12 10.00  1.00 0.3828571 0.11684631
## 13  0.01 10.00 0.5671429 0.06668367
## 14  0.10 10.00 0.5671429 0.06668367
## 15  1.00 10.00 0.5671429 0.07072671
## 16 10.00 10.00 0.5671429 0.07072671
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 700 383
##     1 196 537
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3188326
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 10% radial model had cost = 10 and gamma = 0.01

All radial-kernel models except the 10% split chose gamma = 0.1, which is close to the typical default of 1/p = 1/9 (0.111); the 10% model instead chose gamma = 0.01. The 30/70 and 10/90 models chose a larger cost = 10, indicating that a harder margin is preferred in these models where there is not much data to begin with. Test errors were as follows: 25.2%, 26.7%, 25.8%, 27.1%, and 31.9%. Again, there is no clear relationship between the amount of training data and the test error; however, at a sufficiently small training dataset size, the predictive capability of this model does decrease.

##Polynomial 2-Degree Kernel

# Degree-2 polynomial-kernel SVM on the 90/10 split; cost and gamma tuned by CV
train = train1
test = test1

tune.out <- tune(svm, target ~ ., data = train, kernel = "polynomial", degree = 2,
            ranges = list(cost = c(0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     1
## 
## - best performance: 0.2920011 
## 
## - Detailed performance results:
##   cost gamma     error dispersion
## 1  0.1  0.01 0.4782588 0.06032662
## 2  1.0  0.01 0.4573189 0.03956203
## 3 10.0  0.01 0.3371775 0.04205975
## 4  0.1  0.10 0.3371775 0.04205975
## 5  1.0  0.10 0.3013721 0.03341770
## 6 10.0  0.10 0.2941898 0.03158900
## 7  0.1  1.00 0.2941898 0.03158900
## 8  1.0  1.00 0.2920011 0.03042190
## 9 10.0  1.00 0.2941989 0.02866208
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred  0  1
##     0 81 47
##     1 21 53
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3366337
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 90% polynomial model had cost = 1 and gamma = 1

# Degree-2 polynomial-kernel SVM on the 70/30 split; cost and gamma tuned by CV
train = train2
test = test2

tune.out <- tune(svm, target ~ ., data = train, kernel = "polynomial", degree = 2,
            ranges = list(cost = c(0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10   0.1
## 
## - best performance: 0.2933773 
## 
## - Detailed performance results:
##   cost gamma     error dispersion
## 1  0.1  0.01 0.5223055 0.04331132
## 2  1.0  0.01 0.4691090 0.03981443
## 3 10.0  0.01 0.3472880 0.04189390
## 4  0.1  0.10 0.3472880 0.04189390
## 5  1.0  0.10 0.2976576 0.01760823
## 6 10.0  0.10 0.2933773 0.02654088
## 7  0.1  1.00 0.2933773 0.02654088
## 8  1.0  1.00 0.2976176 0.03168654
## 9 10.0  1.00 0.2954900 0.03372718
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 243 136
##     1  52 175
mean(ypred != test$target)  # test misclassification rate
## [1] 0.310231
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 70% polynomial model had cost = 10 and gamma = 0.1

# Degree-2 polynomial-kernel SVM on the 50/50 split; cost and gamma tuned by CV
train = train3
test = test3

tune.out <- tune(svm, target ~ ., data = train, kernel = "polynomial", degree = 2,
            ranges = list(cost = c(0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10     1
## 
## - best performance: 0.2867426 
## 
## - Detailed performance results:
##   cost gamma     error dispersion
## 1  0.1  0.01 0.4901188 0.04562164
## 2  1.0  0.01 0.4534455 0.04916907
## 3 10.0  0.01 0.3205248 0.05282009
## 4  0.1  0.10 0.3205248 0.05282009
## 5  1.0  0.10 0.2947030 0.03675084
## 6 10.0  0.10 0.2877426 0.04460625
## 7  0.1  1.00 0.2877426 0.04460625
## 8  1.0  1.00 0.2867426 0.04771356
## 9 10.0  1.00 0.2867426 0.04861812
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 399 207
##     1 104 299
mean(ypred != test$target)  # test misclassification rate
## [1] 0.308226
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 50% polynomial model had cost = 10 and gamma = 1

# Degree-2 polynomial-kernel SVM on the 30/70 split; cost and gamma tuned by CV
train = train4
test = test4

tune.out <- tune(svm, target ~ ., data = train, kernel = "polynomial", degree = 2,
            ranges = list(cost = c(0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     1
## 
## - best performance: 0.309153 
## 
## - Detailed performance results:
##   cost gamma     error dispersion
## 1  0.1  0.01 0.4710383 0.04269548
## 2  1.0  0.01 0.4710383 0.04269548
## 3 10.0  0.01 0.4048907 0.05454156
## 4  0.1  0.10 0.4048907 0.05454156
## 5  1.0  0.10 0.3191257 0.05460487
## 6 10.0  0.10 0.3190984 0.06091548
## 7  0.1  1.00 0.3190984 0.06091548
## 8  1.0  1.00 0.3091530 0.05781058
## 9 10.0  1.00 0.3107923 0.06215462
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 508 299
##     1 169 436
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3314448
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 30% polynomial model had cost = 1 and gamma = 1

# Degree-2 polynomial-kernel SVM on the 10/90 split; cost and gamma tuned by CV
train = train5
test = test5

tune.out <- tune(svm, target ~ ., data = train, kernel = "polynomial", degree = 2,
            ranges = list(cost = c(0.1, 1, 10),
                          gamma = c(0.01, 0.1, 1)))

summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10   0.1
## 
## - best performance: 0.3278571 
## 
## - Detailed performance results:
##   cost gamma     error dispersion
## 1  0.1  0.01 0.5473810  0.0533514
## 2  1.0  0.01 0.5473810  0.0533514
## 3 10.0  0.01 0.4583333  0.1192181
## 4  0.1  0.10 0.4583333  0.1192181
## 5  1.0  0.10 0.3385714  0.1081802
## 6 10.0  0.10 0.3278571  0.1034249
## 7  0.1  1.00 0.3278571  0.1034249
## 8  1.0  1.00 0.3823810  0.1363103
## 9 10.0  1.00 0.3721429  0.1075627
bestmod <- tune.out$best.model
ypred <- predict(bestmod, test)

table(ypred, test$target)  # confusion matrix: rows = predicted, cols = actual
##      
## ypred   0   1
##     0 659 437
##     1 237 483
mean(ypred != test$target)  # test misclassification rate
## [1] 0.3711454
plot(bestmod, train, loudness ~ instrumentalness)  # boundary in 2 predictors

The 10% polynomial model had cost = 10 and gamma = 0.1

The values for cost and gamma seem to fluctuate much more when using a 2nd degree polynomial kernel however there is not a clear relationship between them and the training data size.

The test error rates for these models are 33.7%, 31.0%, 30.8%, 33.1% and 37.1%. Once again this appears to follow the pattern of high error on the smallest training set and relatively consistent test errors for the other sizes of training data. These error rates are generally higher than the radial kernel's, but smaller than the linear kernel's.

Neural Network

# Tune the nnet parameters: hidden-layer size and weight decay (regularization).
# caret's nnet method with classProbs=TRUE needs factor levels that are valid
# R variable names, so recode the 0/1 target levels on every split first.
levels(train1$target) = make.names(levels(factor(train1$target)))
levels(test1$target) = make.names(levels(factor(test1$target)))

levels(train2$target) = make.names(levels(factor(train2$target)))
levels(test2$target) = make.names(levels(factor(test2$target)))

levels(train3$target) = make.names(levels(factor(train3$target)))
levels(test3$target) = make.names(levels(factor(test3$target)))

levels(train4$target) = make.names(levels(factor(train4$target)))
levels(test4$target) = make.names(levels(factor(test4$target)))

levels(train5$target) = make.names(levels(factor(train5$target)))
levels(test5$target) = make.names(levels(factor(test5$target)))

# 10-fold CV; grid over hidden-layer sizes 10,30,...,90 and four decay values
fitControl <- trainControl(method = "cv",number =10,classProbs =TRUE)
nnetGrid <-  expand.grid(size = c(seq(10, 100, 20)), decay = c(0, 0.1, 0.2, 0.4))
# Single-hidden-layer neural network on the 90% training split
nnMod <- train(target~., data=train1, method='nnet', trace = F,trControl = fitControl, tuneGrid = nnetGrid)
print("Optimal Hidden Layers/Weights")
## [1] "Optimal Hidden Layers/Weights"
nnMod$finalModel
## a 9-90-1 network with 991 weights
## inputs: acousticness danceability duration_ms energy instrumentalness loudness speechiness tempo valence 
## output(s): .outcome 
## options were - entropy fitting  decay=0.4
print("Optimal Decay")
## [1] "Optimal Decay"
nnMod$finalModel$decay
## [1] 0.4
yhat.nn <- predict(nnMod, newdata = test1)
print("Test Error")
## [1] "Test Error"
mean(yhat.nn!=test1$target)  # test misclassification rate
## [1] 0.2277228

On the 90% training dataset, the best model was a network with a 90-node hidden layer and 991 weights.

# Neural network on the 70% training split (same CV control and grid as above)
nnMod <- train(target~., data=train2, method='nnet', trace = F,trControl = fitControl, tuneGrid = nnetGrid)
print("Optimal Hidden Layers/Weights")
## [1] "Optimal Hidden Layers/Weights"
nnMod$finalModel
## a 9-70-1 network with 771 weights
## inputs: acousticness danceability duration_ms energy instrumentalness loudness speechiness tempo valence 
## output(s): .outcome 
## options were - entropy fitting  decay=0.4
print("Optimal Decay")
## [1] "Optimal Decay"
nnMod$finalModel$decay
## [1] 0.4
yhat.nn <- predict(nnMod, newdata = test2)
print("Test Error")
## [1] "Test Error"
mean(yhat.nn!=test2$target)  # test misclassification rate
## [1] 0.2557756
## [1] 0.2557756

On the 70% training dataset, the best model was a network with a 70-node hidden layer and 771 weights.

# Neural network on the 50% training split (same CV control and grid as above)
nnMod <- train(target~., data=train3, method='nnet', trace = F,trControl = fitControl, tuneGrid = nnetGrid)
print("Optimal Hidden Layers/Weights")
## [1] "Optimal Hidden Layers/Weights"
nnMod$finalModel
## a 9-50-1 network with 551 weights
## inputs: acousticness danceability duration_ms energy instrumentalness loudness speechiness tempo valence 
## output(s): .outcome 
## options were - entropy fitting  decay=0.4
print("Optimal Decay")
## [1] "Optimal Decay"
nnMod$finalModel$decay
## [1] 0.4
yhat.nn <- predict(nnMod, newdata = test3)
print("Test Error")
## [1] "Test Error"
mean(yhat.nn!=test3$target)  # test misclassification rate
## [1] 0.2537166
## [1] 0.2537166

On the 50% training dataset, the best model was a network with a 50-node hidden layer and 551 weights.

# Neural network on the 30% training split (same CV control and grid as above)
nnMod <- train(target~., data=train4, method='nnet', trace = F,trControl = fitControl, tuneGrid = nnetGrid)
print("Optimal Hidden Layers/Weights")
## [1] "Optimal Hidden Layers/Weights"
nnMod$finalModel
## a 9-90-1 network with 991 weights
## inputs: acousticness danceability duration_ms energy instrumentalness loudness speechiness tempo valence 
## output(s): .outcome 
## options were - entropy fitting  decay=0.4
print("Optimal Decay")
## [1] "Optimal Decay"
nnMod$finalModel$decay
## [1] 0.4
yhat.nn <- predict(nnMod, newdata = test4)
print("Test Error")
## [1] "Test Error"
mean(yhat.nn!=test4$target)  # test misclassification rate
## [1] 0.2889518
## [1] 0.2889518

On the 30% training dataset, the best model was a network with a 90-node hidden layer and 991 weights.

# Neural network on the 10% training split (same CV control and grid as above)
nnMod <- train(target~., data=train5, method='nnet', trace = F,trControl = fitControl, tuneGrid = nnetGrid)
print("Optimal Hidden Layers/Weights")
## [1] "Optimal Hidden Layers/Weights"
nnMod$finalModel
## a 9-50-1 network with 551 weights
## inputs: acousticness danceability duration_ms energy instrumentalness loudness speechiness tempo valence 
## output(s): .outcome 
## options were - entropy fitting  decay=0.1
print("Optimal Decay")
## [1] "Optimal Decay"
nnMod$finalModel$decay
## [1] 0.1
yhat.nn <- predict(nnMod, newdata = test5)
print("Test Error")
## [1] "Test Error"
mean(yhat.nn!=test5$target)  # test misclassification rate
## [1] 0.3270925
## [1] 0.3270925

On the 10% training dataset, the best model was a network with a 50-node hidden layer and 551 weights.

The neural network, when fit to the various sizes of training data, showed a strong amount of sensitivity. It performs fairly well in comparison to some other methods because there are more tuning parameters to adjust (here optimization is somewhat limited by runtime practicality, as with some other models), and this again allows for more complex relationships to be captured. In general, neural nets with more nodes in the hidden layer perform better on every training size. For the larger training set sizes, having a greater decay value (or regularization parameter) is best, but a smaller decay seems to be preferred on the smaller training sizes. The sensitivity to the size of the training data is more significant than in some of the other models. Overall, training on a good amount of data with a large regularization parameter and a large number of hidden-layer nodes is the best way to minimize test error in these neural networks.

Conclusion

## Registered S3 method overwritten by 'rvest':
##   method            from
##   read_xml.response xml2
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
Test Error Rates across methods
Method % of Data in Training Test Error
Logistic 90 0.3514851
Logistic 70 0.3811881
Logistic 50 0.3756194
Logistic 30 0.3916431
Logistic 10 0.3981278
LDA 90 0.3465347
LDA 70 0.3481848
LDA 50 0.3481848
LDA 30 0.3307365
LDA 10 0.3738987
QDA 90 0.3019802
QDA 70 0.3019802
QDA 50 0.2874133
QDA 30 0.3194051
QDA 10 0.3056167
KNN 90 0.2970297
KNN 70 0.2772277
KNN 50 0.3062438
KNN 30 0.3002833
KNN 10 0.3546256
Random Forest 90 0.2227723
Random Forest 70 0.2376238
Random Forest 50 0.2358771
Random Forest 30 0.2379603
Random Forest 10 0.2957048
Boosting 90 0.2425743
Boosting 70 0.2557756
Boosting 50 0.2626363
Boosting 30 0.2599150
Boosting 10 0.3199339
SVM-Linear 90 0.3415842
SVM-Linear 70 0.3250825
SVM-Linear 50 0.3270565
SVM-Linear 30 0.3512748
SVM-Linear 10 0.3606828
SVM-Radial 90 0.2524752
SVM-Radial 70 0.2673267
SVM-Radial 50 0.2576809
SVM-Radial 30 0.2712465
SVM-Radial 10 0.3188326
SVM-2nd Degree Polynomial 90 0.3366337
SVM-2nd Degree Polynomial 70 0.3102310
SVM-2nd Degree Polynomial 50 0.3082260
SVM-2nd Degree Polynomial 30 0.3314448
SVM-2nd Degree Polynomial 10 0.3711454
Neural Net 90 0.2277228
Neural Net 70 0.2557756
Neural Net 50 0.2537166
Neural Net 30 0.2889518
Neural Net 10 0.3270925

The results of testing various classification models on various sizes of the train/test split, using optimized parameters when appropriate, indicate that there is a reasonable amount of non-linear complexity in determining, from the remaining predictors, whether or not the listener liked a song. As shown above, Random Forests ultimately provided the best model with respect to test error, achieving a 22.27% test error with 90% training data. For all types of models save for LDA and QDA, the optimal solution was found when the amount of data in the training set was 70% or higher. While some models were more sensitive to these changes in the amount of training data, LDA and QDA stood out as being some of the least sensitive to the size of the training set. Random Forests were also relatively insensitive to the training data size up until only 10% of the data was used for training. Conversely, the model type most sensitive to changes in training size is the neural network. With these ideas in mind, appropriate models can be chosen and tuned to fit the situation the data presents. In the listener's dataset, it appears that there were some complex relationships which needed to be accounted for by more flexible models, so a well-tuned random forest did the best job and proved to arguably be the most robust.